Easier than R's usual filter methods.
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1: 21.4 4 121 109 4.11 2.78 18.6 1 1 4 2
## [1] 22
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1: 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## 2: 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## 3: 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## 4: 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## 5: 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## 6: 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## 7: 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## 8: 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## 9: 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## 10: 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## 11: 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## 12: 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## 13: 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## 14: 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1: 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## 2: 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## 3: 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## 4: 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## 5: 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## 6: 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## 7: 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## 8: 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## 9: 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## 10: 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## 11: 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## 12: 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## 13: 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## 14: 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1: 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## 2: 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## 3: 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## 4: 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## 5: 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## 6: 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## 7: 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## 8: 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## 9: 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## 10: 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## 11: 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## 12: 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## 13: 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## 14: 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## 15: 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## 16: 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## 17: 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## 18: 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## 19: 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## 20: 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## 21: 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## 22: 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## 23: 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## 24: 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## 25: 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## 26: 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## 27: 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## 28: 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## 29: 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## 30: 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
## mpg cyl disp hp drat wt qsec vs am gear carb
%like% allows you to search for a pattern in a character or a factor vector.
## [1] TRUE TRUE FALSE
## [1] FALSE FALSE TRUE
## [1] TRUE TRUE FALSE
# Convert the trips data to a data.table so the DT[i, j, by] syntax applies.
batrips <- as.data.table(batrips)
# Keep the rows whose start_station begins with "San Francisco": %like%
# matches a regular expression, and "^" anchors it at the start.
batrips[start_station %like% "^San Francisco", ]
## trip_id duration start_date
## 1: 139545 435 2014-01-01 00:14:00
## 2: 139546 432 2014-01-01 00:14:00
## 3: 139594 981 2014-01-01 03:43:00
## 4: 139626 1423 2014-01-01 10:09:00
## 5: 139629 737 2014-01-01 10:12:00
## ---
## 42417: 588867 1092 2014-12-31 18:15:00
## 42418: 588868 1029 2014-12-31 18:16:00
## 42419: 588872 1037 2014-12-31 18:30:00
## 42420: 588898 979 2014-12-31 22:06:00
## 42421: 588902 677 2014-12-31 22:09:00
## start_station start_terminal
## 1: San Francisco City Hall 58
## 2: San Francisco City Hall 58
## 3: San Francisco City Hall 58
## 4: San Francisco Caltrain 2 (330 Townsend) 69
## 5: San Francisco Caltrain 2 (330 Townsend) 69
## ---
## 42417: San Francisco City Hall 58
## 42418: San Francisco City Hall 58
## 42419: San Francisco Caltrain (Townsend at 4th) 70
## 42420: San Francisco City Hall 58
## 42421: San Francisco Caltrain (Townsend at 4th) 70
## end_date end_station
## 1: 2014-01-01 00:21:00 Townsend at 7th
## 2: 2014-01-01 00:21:00 Townsend at 7th
## 3: 2014-01-01 03:59:00 San Francisco City Hall
## 4: 2014-01-01 10:33:00 San Francisco Caltrain 2 (330 Townsend)
## 5: 2014-01-01 10:24:00 Harry Bridges Plaza (Ferry Building)
## ---
## 42417: 2014-12-31 18:33:00 Powell at Post (Union Square)
## 42418: 2014-12-31 18:33:00 Powell at Post (Union Square)
## 42419: 2014-12-31 18:47:00 Broadway St at Battery St
## 42420: 2014-12-31 22:22:00 Clay at Battery
## 42421: 2014-12-31 22:20:00 Grant Avenue at Columbus Avenue
## end_terminal bike_id subscription_type zip_code
## 1: 65 473 Subscriber 94612
## 2: 65 395 Subscriber 94107
## 3: 58 273 Customer 89503
## 4: 69 309 Subscriber 94105
## 5: 50 584 Subscriber 94107
## ---
## 42417: 71 633 Customer 93245
## 42418: 71 408 Customer 93245
## 42419: 82 613 Subscriber 94133
## 42420: 41 385 Subscriber 94111
## 42421: 73 440 Subscriber 94133
## trip_id duration start_date
## 1: 139545 435 2014-01-01 00:14:00
## 2: 139546 432 2014-01-01 00:14:00
## 3: 139594 981 2014-01-01 03:43:00
## 4: 139626 1423 2014-01-01 10:09:00
## 5: 139629 737 2014-01-01 10:12:00
## ---
## 42417: 588867 1092 2014-12-31 18:15:00
## 42418: 588868 1029 2014-12-31 18:16:00
## 42419: 588872 1037 2014-12-31 18:30:00
## 42420: 588898 979 2014-12-31 22:06:00
## 42421: 588902 677 2014-12-31 22:09:00
## start_station start_terminal
## 1: San Francisco City Hall 58
## 2: San Francisco City Hall 58
## 3: San Francisco City Hall 58
## 4: San Francisco Caltrain 2 (330 Townsend) 69
## 5: San Francisco Caltrain 2 (330 Townsend) 69
## ---
## 42417: San Francisco City Hall 58
## 42418: San Francisco City Hall 58
## 42419: San Francisco Caltrain (Townsend at 4th) 70
## 42420: San Francisco City Hall 58
## 42421: San Francisco Caltrain (Townsend at 4th) 70
## end_date end_station
## 1: 2014-01-01 00:21:00 Townsend at 7th
## 2: 2014-01-01 00:21:00 Townsend at 7th
## 3: 2014-01-01 03:59:00 San Francisco City Hall
## 4: 2014-01-01 10:33:00 San Francisco Caltrain 2 (330 Townsend)
## 5: 2014-01-01 10:24:00 Harry Bridges Plaza (Ferry Building)
## ---
## 42417: 2014-12-31 18:33:00 Powell at Post (Union Square)
## 42418: 2014-12-31 18:33:00 Powell at Post (Union Square)
## 42419: 2014-12-31 18:47:00 Broadway St at Battery St
## 42420: 2014-12-31 22:22:00 Clay at Battery
## 42421: 2014-12-31 22:20:00 Grant Avenue at Columbus Avenue
## end_terminal bike_id subscription_type zip_code
## 1: 65 473 Subscriber 94612
## 2: 65 395 Subscriber 94107
## 3: 58 273 Customer 89503
## 4: 69 309 Subscriber 94105
## 5: 50 584 Subscriber 94107
## ---
## 42417: 71 633 Customer 93245
## 42418: 71 408 Customer 93245
## 42419: 82 613 Subscriber 94133
## 42420: 41 385 Subscriber 94111
## 42421: 73 440 Subscriber 94133
%between% allows you to search for values in the closed interval [val1, val2].
## [1] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE
## trip_id duration start_date
## 1: 139640 2203 2014-01-01 10:38:00
## 2: 139672 2421 2014-01-01 11:41:00
## 3: 139673 2300 2014-01-01 11:44:00
## 4: 139674 2284 2014-01-01 11:44:00
## 5: 139676 2096 2014-01-01 11:47:00
## ---
## 3596: 587365 2326 2014-12-29 15:48:00
## 3597: 588010 2568 2014-12-30 12:03:00
## 3598: 588009 2569 2014-12-30 12:04:00
## 3599: 588852 2107 2014-12-31 17:37:00
## 3600: 588897 2379 2014-12-31 22:03:00
## start_station start_terminal
## 1: Steuart at Market 74
## 2: Washington at Kearney 46
## 3: Washington at Kearney 46
## 4: Washington at Kearney 46
## 5: Washington at Kearney 46
## ---
## 3596: Powell at Post (Union Square) 71
## 3597: Harry Bridges Plaza (Ferry Building) 50
## 3598: Harry Bridges Plaza (Ferry Building) 50
## 3599: Townsend at 7th 65
## 3600: Paseo de San Antonio 7
## end_date end_station end_terminal bike_id
## 1: 2014-01-01 11:14:00 Embarcadero at Sansome 60 341
## 2: 2014-01-01 12:22:00 Embarcadero at Bryant 54 477
## 3: 2014-01-01 12:22:00 Embarcadero at Bryant 54 367
## 4: 2014-01-01 12:22:00 Embarcadero at Bryant 54 419
## 5: 2014-01-01 12:22:00 Embarcadero at Bryant 54 628
## ---
## 3596: 2014-12-29 16:27:00 Davis at Jackson 42 402
## 3597: 2014-12-30 12:46:00 Embarcadero at Sansome 60 453
## 3598: 2014-12-30 12:46:00 Embarcadero at Sansome 60 497
## 3599: 2014-12-31 18:12:00 2nd at Folsom 62 560
## 3600: 2014-12-31 22:42:00 Paseo de San Antonio 7 296
## subscription_type zip_code
## 1: Subscriber 94103
## 2: Customer 94583
## 3: Customer 90026
## 4: Customer 90026
## 5: Customer 91354
## ---
## 3596: Subscriber 94111
## 3597: Customer 55
## 3598: Customer 55
## 3599: Subscriber 94582
## 3600: Customer 91387
## trip_id duration start_date
## 1: 139640 2203 2014-01-01 10:38:00
## 2: 139672 2421 2014-01-01 11:41:00
## 3: 139673 2300 2014-01-01 11:44:00
## 4: 139674 2284 2014-01-01 11:44:00
## 5: 139676 2096 2014-01-01 11:47:00
## ---
## 3596: 587365 2326 2014-12-29 15:48:00
## 3597: 588010 2568 2014-12-30 12:03:00
## 3598: 588009 2569 2014-12-30 12:04:00
## 3599: 588852 2107 2014-12-31 17:37:00
## 3600: 588897 2379 2014-12-31 22:03:00
## start_station start_terminal
## 1: Steuart at Market 74
## 2: Washington at Kearney 46
## 3: Washington at Kearney 46
## 4: Washington at Kearney 46
## 5: Washington at Kearney 46
## ---
## 3596: Powell at Post (Union Square) 71
## 3597: Harry Bridges Plaza (Ferry Building) 50
## 3598: Harry Bridges Plaza (Ferry Building) 50
## 3599: Townsend at 7th 65
## 3600: Paseo de San Antonio 7
## end_date end_station end_terminal bike_id
## 1: 2014-01-01 11:14:00 Embarcadero at Sansome 60 341
## 2: 2014-01-01 12:22:00 Embarcadero at Bryant 54 477
## 3: 2014-01-01 12:22:00 Embarcadero at Bryant 54 367
## 4: 2014-01-01 12:22:00 Embarcadero at Bryant 54 419
## 5: 2014-01-01 12:22:00 Embarcadero at Bryant 54 628
## ---
## 3596: 2014-12-29 16:27:00 Davis at Jackson 42 402
## 3597: 2014-12-30 12:46:00 Embarcadero at Sansome 60 453
## 3598: 2014-12-30 12:46:00 Embarcadero at Sansome 60 497
## 3599: 2014-12-31 18:12:00 2nd at Folsom 62 560
## 3600: 2014-12-31 22:42:00 Paseo de San Antonio 7 296
## subscription_type zip_code
## 1: Subscriber 94103
## 2: Customer 94583
## 3: Customer 90026
## 4: Customer 90026
## 5: Customer 91354
## ---
## 3596: Subscriber 94111
## 3597: Customer 55
## 3598: Customer 55
## 3599: Subscriber 94582
## 3600: Customer 91387
%chin% is similar to %in%, but it is much faster and only works on character vectors.
# Keep the trips that start at "Japantown", "Mezes Park" or "MLK Library";
# %chin% is the character-only counterpart of %in%.
batrips[
  start_station %chin% c("Japantown", "Mezes Park", "MLK Library"),
]
## trip_id duration start_date start_station start_terminal
## 1: 140683 426 2014-01-02 16:07:00 MLK Library 11
## 2: 140707 516 2014-01-02 16:21:00 Japantown 9
## 3: 140787 333 2014-01-02 16:51:00 MLK Library 11
## 4: 141377 936 2014-01-03 09:08:00 Japantown 9
## 5: 141476 467 2014-01-03 10:08:00 MLK Library 11
## ---
## 1901: 588368 34595 2014-12-31 01:10:00 MLK Library 11
## 1902: 588429 692 2014-12-31 07:29:00 MLK Library 11
## 1903: 588430 599 2014-12-31 07:30:00 MLK Library 11
## 1904: 588766 3237 2014-12-31 15:31:00 Japantown 9
## 1905: 588765 3249 2014-12-31 15:31:00 Japantown 9
## end_date end_station end_terminal
## 1: 2014-01-02 16:14:00 San Jose Diridon Caltrain Station 2
## 2: 2014-01-02 16:30:00 Paseo de San Antonio 7
## 3: 2014-01-02 16:56:00 Adobe on Almaden 5
## 4: 2014-01-03 09:24:00 San Jose Diridon Caltrain Station 2
## 5: 2014-01-03 10:15:00 Paseo de San Antonio 7
## ---
## 1901: 2014-12-31 10:47:00 MLK Library 11
## 1902: 2014-12-31 07:40:00 San Jose Diridon Caltrain Station 2
## 1903: 2014-12-31 07:40:00 San Jose Diridon Caltrain Station 2
## 1904: 2014-12-31 16:25:00 Japantown 9
## 1905: 2014-12-31 16:25:00 Japantown 9
## bike_id subscription_type zip_code
## 1: 657 Subscriber 94043
## 2: 188 Subscriber 95112
## 3: 140 Subscriber 94536
## 4: 11 Subscriber 95112
## 5: 182 Subscriber 95035
## ---
## 1901: 241 Customer 95076
## 1902: 180 Subscriber 95112
## 1903: 682 Subscriber 95113
## 1904: 253 Customer 95112
## 1905: 702 Customer 95112
## trip_id duration start_date start_station start_terminal
## 1: 140683 426 2014-01-02 16:07:00 MLK Library 11
## 2: 140707 516 2014-01-02 16:21:00 Japantown 9
## 3: 140787 333 2014-01-02 16:51:00 MLK Library 11
## 4: 141377 936 2014-01-03 09:08:00 Japantown 9
## 5: 141476 467 2014-01-03 10:08:00 MLK Library 11
## ---
## 1901: 588368 34595 2014-12-31 01:10:00 MLK Library 11
## 1902: 588429 692 2014-12-31 07:29:00 MLK Library 11
## 1903: 588430 599 2014-12-31 07:30:00 MLK Library 11
## 1904: 588766 3237 2014-12-31 15:31:00 Japantown 9
## 1905: 588765 3249 2014-12-31 15:31:00 Japantown 9
## end_date end_station end_terminal
## 1: 2014-01-02 16:14:00 San Jose Diridon Caltrain Station 2
## 2: 2014-01-02 16:30:00 Paseo de San Antonio 7
## 3: 2014-01-02 16:56:00 Adobe on Almaden 5
## 4: 2014-01-03 09:24:00 San Jose Diridon Caltrain Station 2
## 5: 2014-01-03 10:15:00 Paseo de San Antonio 7
## ---
## 1901: 2014-12-31 10:47:00 MLK Library 11
## 1902: 2014-12-31 07:40:00 San Jose Diridon Caltrain Station 2
## 1903: 2014-12-31 07:40:00 San Jose Diridon Caltrain Station 2
## 1904: 2014-12-31 16:25:00 Japantown 9
## 1905: 2014-12-31 16:25:00 Japantown 9
## bike_id subscription_type zip_code
## 1: 657 Subscriber 94043
## 2: 188 Subscriber 95112
## 3: 140 Subscriber 94536
## 4: 11 Subscriber 95112
## 5: 182 Subscriber 95035
## ---
## 1901: 241 Customer 95076
## 1902: 180 Subscriber 95112
## 1903: 682 Subscriber 95113
## 1904: 253 Customer 95112
## 1905: 702 Customer 95112
## trip_id duration
## 1: 139545 435
## 2: 139546 432
## duration start_station
## 1: 435 San Francisco City Hall
## 2: 432 San Francisco City Hall
# Drop the listed columns and keep every other one; negating a character
# vector of names in j deselects those columns.
ans <- batrips[, !c("start_date", "end_date", "end_station")]
head(ans, n = 1)
## trip_id duration start_station start_terminal end_terminal
## 1: 139545 435 San Francisco City Hall 58 65
## bike_id subscription_type zip_code
## 1: 473 Subscriber 94612
## Wrapping j in list() (or its alias .()) always returns a data.table,
## never a bare vector, even when only one column is selected.
## Columns can be renamed on the fly (duration -> dur).
ans <- batrips[, .(trip_id, dur = duration)]
head(ans, n = 2)
## trip_id dur
## 1: 139545 435
## 2: 139546 432
## trip_id duration
## 1: 139545 435
## 2: 139546 432
# Compute the mean of the duration column the data.table way: j is evaluated
# inside the table, so the column is referenced by its bare name
ans <- batrips[, mean(duration)]
# Compute the mean of duration for trips whose start station is "Japantown"
# (i filters the rows before j is evaluated)
batrips[start_station == "Japantown", mean(duration)]
## [1] 2464.331
## [1] 902
# Median duration of subscriber trips that end at "Market at 10th":
# both conditions go in i, the aggregate goes in j.
median_duration_filter <- batrips[
  end_station == "Market at 10th" & subscription_type == "Subscriber",
  median(duration)
]
median_duration_filter
## [1] 651
# Elapsed time of every trip, computed from its end and start timestamps;
# j may contain any expression, not just column names.
trip_duration <- batrips[, difftime(time1 = end_date, time2 = start_date)]
head(trip_duration)
## Time differences in mins
## [1] 7 7 25 27 27 13
## Multiple computations in one call
# Mean and median duration of trips starting at "Japantown"; each named
# element of the list in j becomes a column of the result.
batrips[
  start_station == "Japantown",
  list(mn_dur = mean(duration), med_dur = median(duration))
]
## mn_dur med_dur
## 1: 2464.331 782
##
# Shortest and longest trip among those that start at "Townsend at 7th"
# and have a duration below 500.
duration_stats <- batrips[
  start_station == "Townsend at 7th" & duration < 500,
  list(min_dur = min(duration), max_dur = max(duration))
]
duration_stats
## min_dur max_dur
## 1: 62 499
## $breaks
## [1] 50 100 150 200 250 300 350 400 450 500
##
## $counts
## [1] 28 15 792 2042 920 314 314 497 538
##
## $density
## [1] 1.025641e-04 5.494505e-05 2.901099e-03 7.479853e-03 3.369963e-03
## [6] 1.150183e-03 1.150183e-03 1.820513e-03 1.970696e-03
##
## $mids
## [1] 75 125 175 225 275 325 375 425 475
##
## $xname
## [1] "duration"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
# Number of trips per start station: .N is the row count of each group.
ans <- batrips[, .N, by = start_station]
head(ans, n = 3)
## start_station N
## 1: San Francisco City Hall 2145
## 2: Embarcadero at Sansome 12879
## 3: Steuart at Market 11579
## start no_trips
## 1: San Francisco City Hall 2145
## 2: Embarcadero at Sansome 12879
## 3: Steuart at Market 11579
# Number of trips per start station and calendar month; expressions are
# allowed in by, and `mon` names the computed grouping column.
ans <- batrips[, .N, by = list(start_station, mon = month(start_date))]
head(ans, n = 3)
## start_station mon N
## 1: San Francisco City Hall 1 193
## 2: Embarcadero at Sansome 1 985
## 3: Steuart at Market 1 813
# Mean trip duration for each start station (one output row per station).
mean_start_stn <- batrips[, list(mean_duration = mean(duration)),
                          by = start_station]
mean_start_stn
## start_station mean_duration
## 1: San Francisco City Hall 1893.9361
## 2: Embarcadero at Sansome 1418.1822
## 3: Steuart at Market 956.9007
## 4: 5th at Howard 845.0584
## 5: Harry Bridges Plaza (Ferry Building) 1516.3534
## 6: Beale at Market 856.6453
## 7: Embarcadero at Folsom 853.6766
## 8: 2nd at South Park 697.7034
## 9: Santa Clara at Almaden 954.2218
## 10: Powell Street BART 1332.3455
## 11: Howard at 2nd 739.9716
## 12: 2nd at Townsend 840.9178
## 13: South Van Ness at Market 3800.7443
## 14: 2nd at Folsom 551.0807
## 15: Market at 4th 1272.1251
## 16: Market at 10th 1073.2136
## 17: Market at Sansome 828.6980
## 18: Embarcadero at Bryant 992.9516
## 19: Temporary Transbay Terminal (Howard at Beale) 655.8563
## 20: Civic Center BART (7th at Market) 1287.9282
## 21: San Francisco Caltrain 2 (330 Townsend) 701.5882
## 22: Grant Avenue at Columbus Avenue 1244.8732
## 23: Paseo de San Antonio 1813.1946
## 24: San Jose Civic Center 3553.7180
## 25: University and Emerson 5434.5844
## 26: Townsend at 7th 700.8317
## 27: Embarcadero at Vallejo 1645.5448
## 28: Washington at Kearney 2335.1965
## 29: Spear at Folsom 739.5398
## 30: San Francisco Caltrain (Townsend at 4th) 818.6879
## 31: Davis at Jackson 935.8838
## 32: Clay at Battery 1200.1642
## 33: Golden Gate at Polk 1196.0425
## 34: Yerba Buena Center of the Arts (3rd @ Howard) 870.8964
## 35: Powell at Post (Union Square) 1608.1639
## 36: San Antonio Caltrain Station 2819.7316
## 37: Rengstorff Avenue / California Street 4363.0287
## 38: Cowper at University 2190.2248
## 39: Mechanics Plaza (Market at Battery) 1007.1464
## 40: Mountain View Caltrain Station 1268.4549
## 41: Adobe on Almaden 844.9003
## 42: Commercial at Montgomery 796.4848
## 43: SJSU - San Salvador at 9th 936.7344
## 44: Post at Kearney 936.8625
## 45: California Ave Caltrain Station 4294.2704
## 46: St James Park 938.1094
## 47: Mountain View City Hall 1558.2308
## 48: San Salvador at 1st 1100.7312
## 49: Evelyn Park and Ride 1212.1923
## 50: San Jose Diridon Caltrain Station 856.9951
## 51: Redwood City Caltrain Station 4221.3608
## 52: Palo Alto Caltrain Station 3219.7698
## 53: San Jose City Hall 1000.9194
## 54: SJSU 4th at San Carlos 2095.5141
## 55: Park at Olive 3704.5740
## 56: Arena Green / SAP Center 2049.5070
## 57: San Pedro Square 968.7299
## 58: MLK Library 1885.4802
## 59: Japantown 2464.3315
## 60: Broadway at Main 3473.1842
## 61: San Jose Government Center 1068.1739
## 62: Castro Street and El Camino Real 1830.9644
## 63: San Mateo County Center 4045.9633
## 64: San Antonio Shopping Center 1372.8279
## 65: Franklin at Maple 803.4824
## 66: Redwood City Medical Center 1912.2763
## 67: Redwood City Public Library 3474.0093
## 68: Broadway St at Battery St 882.9592
## 69: Mezes Park 770.5893
## 70: Washington at Kearny 1885.8746
## 71: Post at Kearny 1034.7064
## 72: Santa Clara County Civic Center 1378.6996
## 73: Ryland Park 1407.0793
## 74: Stanford in Redwood City 1878.1800
## start_station mean_duration
# Mean trip duration for each combination of start station and month of
# the start date; the month grouping column is named explicitly.
mean_start_station <- batrips[
  , list(mean_duration = mean(duration)),
  by = list(start_station, month = month(start_date))
]
mean_start_station
## start_station month mean_duration
## 1: San Francisco City Hall 1 1548.2591
## 2: Embarcadero at Sansome 1 952.1756
## 3: Steuart at Market 1 757.2448
## 4: 5th at Howard 1 599.3433
## 5: Harry Bridges Plaza (Ferry Building) 1 1429.0666
## ---
## 830: California Ave Caltrain Station 12 4230.0000
## 831: University and Emerson 12 7771.2917
## 832: SJSU - San Salvador at 9th 12 652.9333
## 833: San Mateo County Center 12 5033.5714
## 834: Redwood City Public Library 12 496.5000
## trip_id duration start_date
## 1: 295912 3601 2014-05-23 11:18:00
## 2: 347471 3602 2014-07-01 19:33:00
## 3: 536050 3602 2014-11-08 15:41:00
## start_station start_terminal end_date
## 1: Harry Bridges Plaza (Ferry Building) 50 2014-05-23 12:18:00
## 2: Clay at Battery 41 2014-07-01 20:33:00
## 3: Market at 10th 67 2014-11-08 16:41:00
## end_station end_terminal bike_id
## 1: Harry Bridges Plaza (Ferry Building) 50 512
## 2: Embarcadero at Sansome 60 288
## 3: Harry Bridges Plaza (Ferry Building) 50 332
## subscription_type zip_code
## 1: Customer 95757
## 2: Customer 7009
## 3: Customer 64112
## trip_id duration start_date
## 1: 295912 3601 2014-05-23 11:18:00
## 2: 347471 3602 2014-07-01 19:33:00
## 3: 536050 3602 2014-11-08 15:41:00
## start_station start_terminal end_date
## 1: Harry Bridges Plaza (Ferry Building) 50 2014-05-23 12:18:00
## 2: Clay at Battery 41 2014-07-01 20:33:00
## 3: Market at 10th 67 2014-11-08 16:41:00
## end_station end_terminal bike_id
## 1: Harry Bridges Plaza (Ferry Building) 50 512
## 2: Embarcadero at Sansome 60 288
## 3: Harry Bridges Plaza (Ferry Building) 50 332
## subscription_type zip_code
## 1: Customer 95757
## 2: Customer 7009
## 3: Customer 64112
##
# Chain three steps: mean duration per start station, sort ascending by
# that mean, then keep the first three rows.
batrips[, list(mn_dur = mean(duration)), by = start_station][
  order(mn_dur)
][seq_len(3)]
## start_station mn_dur
## 1: 2nd at Folsom 551.0807
## 2: Temporary Transbay Terminal (Howard at Beale) 655.8563
## 3: 2nd at South Park 697.7034
# Count the trips for every (start_station, end_station) pair. The count
# column is named N explicitly so the next step can refer to it.
# NOTE(review): the name trips_dec suggests a December-only subset, but no
# month filter is applied here - confirm whether one is intended.
trips_dec <- batrips[, .(N = .N), by = .(start_station, end_station)]
# Most popular end station for every start station: within each start
# station pick the end station with the highest trip count, rather than
# whichever pair happens to come first (ties resolve to the first maximum).
popular_end_station <- trips_dec[, .(end_station = end_station[which.max(N)]),
                                 by = start_station]
# First and last start_date of every start station, in table row order:
# index 1 is the first row of the group and .N its last row.
first_last <- batrips[
  , list(start_date = start_date[c(1L, .N)]),
  by = "start_station"
]
uniqueN() is a helper function that returns an integer value containing the number of unique values in the input object.
## [1] 2
## [1] 4
## [1] 2
## val1 val2
## 1: 1 f
## 2: 2 e
## 3: 5 b
## 4: 6 a
## val1 val2
## 1: 3 d
## 2: 4 c
## Empty data.table (0 rows and 1 cols): id
# .SD is the subset of data for the current group and .SDcols restricts the
# columns it holds; this returns the first row of each start station with
# only trip_id and duration.
batrips[
  , .SD[1L],
  by = "start_station",
  .SDcols = c("trip_id", "duration")
]
## start_station trip_id duration
## 1: San Francisco City Hall 139545 435
## 2: Embarcadero at Sansome 139547 1523
## 3: Steuart at Market 139549 1620
## 4: 5th at Howard 139554 624
## 5: Harry Bridges Plaza (Ferry Building) 139558 1600
## 6: Beale at Market 139567 477
## 7: Embarcadero at Folsom 139573 966
## 8: 2nd at South Park 139577 901
## 9: Santa Clara at Almaden 139582 198
## 10: Powell Street BART 139585 478
## 11: Howard at 2nd 139587 52577
## 12: 2nd at Townsend 139590 497
## 13: South Van Ness at Market 139593 967
## 14: 2nd at Folsom 139595 3723
## 15: Market at 4th 139600 662
## 16: Market at 10th 139605 1352
## 17: Market at Sansome 139607 5987
## 18: Embarcadero at Bryant 139610 769
## 19: Temporary Transbay Terminal (Howard at Beale) 139618 264
## 20: Civic Center BART (7th at Market) 139625 1026
## 21: San Francisco Caltrain 2 (330 Townsend) 139626 1423
## 22: Grant Avenue at Columbus Avenue 139638 788
## 23: Paseo de San Antonio 139647 29793
## 24: San Jose Civic Center 139648 9876
## 25: University and Emerson 139665 4982
## 26: Townsend at 7th 139667 1075
## 27: Embarcadero at Vallejo 139669 695
## 28: Washington at Kearney 139672 2421
## 29: Spear at Folsom 139675 445
## 30: San Francisco Caltrain (Townsend at 4th) 139681 889
## 31: Davis at Jackson 139685 337
## 32: Clay at Battery 139686 804
## 33: Golden Gate at Polk 139688 3431
## 34: Yerba Buena Center of the Arts (3rd @ Howard) 139693 1081
## 35: Powell at Post (Union Square) 139706 317
## 36: San Antonio Caltrain Station 139715 652
## 37: Rengstorff Avenue / California Street 139724 1184
## 38: Cowper at University 139741 7912
## 39: Mechanics Plaza (Market at Battery) 139770 3289
## 40: Mountain View Caltrain Station 139782 1704
## 41: Adobe on Almaden 139902 869
## 42: Commercial at Montgomery 139937 824
## 43: SJSU - San Salvador at 9th 139942 243
## 44: Post at Kearney 139947 709
## 45: California Ave Caltrain Station 139967 160
## 46: St James Park 140012 3535
## 47: Mountain View City Hall 140014 49605
## 48: San Salvador at 1st 140018 409
## 49: Evelyn Park and Ride 140061 213
## 50: San Jose Diridon Caltrain Station 140062 682
## 51: Redwood City Caltrain Station 140168 227
## 52: Palo Alto Caltrain Station 140298 181
## 53: San Jose City Hall 140400 177
## 54: SJSU 4th at San Carlos 140413 255
## 55: Park at Olive 140417 1053
## 56: Arena Green / SAP Center 140435 205
## 57: San Pedro Square 140458 184
## 58: MLK Library 140683 426
## 59: Japantown 140707 516
## 60: Broadway at Main 140738 88
## 61: San Jose Government Center 140847 413
## 62: Castro Street and El Camino Real 140848 318
## 63: San Mateo County Center 141384 125
## 64: San Antonio Shopping Center 141633 1104
## 65: Franklin at Maple 142257 1305
## 66: Redwood City Medical Center 145785 204
## 67: Redwood City Public Library 146481 917
## 68: Broadway St at Battery St 161546 289
## 69: Mezes Park 192526 588
## 70: Washington at Kearny 198785 880
## 71: Post at Kearny 198954 737
## 72: Santa Clara County Civic Center 206440 169308
## 73: Ryland Park 243456 330
## 74: Stanford in Redwood City 437188 443
## start_station trip_id duration
# Columns to report for each selected trip.
relevant_cols <- c(
  "start_station", "end_station",
  "start_date", "end_date", "duration"
)
# Row of the shortest trip in every month: which.min() gives the position
# of the smallest duration inside the group, and .SD is indexed with it.
shortest <- batrips[
  , .SD[which.min(duration)],
  by = .(month = month(start_date)),
  .SDcols = relevant_cols
]
# Per month, count the distinct start stations and the distinct zip codes:
# uniqueN is applied to each column listed in .SDcols.
unique_station_month <- batrips[
  , lapply(.SD, uniqueN),
  by = list(month = month(start_date)),
  .SDcols = c("start_station", "zip_code")
]
# Add several columns by reference in one call: a character vector of names
# on the left of := and a list of values of the same length on the right
batrips[, c("is_dur_gt_1hour", "week_day") := list(duration > 3600,
wday(start_date))]
# When adding a single column, quotes around its name aren't necessary
batrips[, is_dur_gt_1hour := duration > 3600]
## Functional form: `:=`() takes name = value pairs, and assigning NULL
## removes that column
batrips[, `:=`(is_dur_gt_1hour = NULL, ## Delete the column
start_station = toupper(start_station))]
# Fix spelling in the second row of start_station
# untidy[,start_station := ifelse(start_station %like% c("Sen"),"San Francisco City Hall",start_station)]
# Count the trips per zip code and store that count on every row of batrips
# (added by reference, so batrips keeps the n_zip_code column)
batrips[, n_zip_code := .N, by = zip_code]
# Keep the zip codes with more than 1000 trips, then drop the helper column
# from the filtered result
zip_1000 <- batrips[n_zip_code > 1000][, n_zip_code := NULL]
# Same as above, written as a single chain
zip_1000 <- batrips[, n_zip_code := .N,
by = zip_code][n_zip_code > 1000][, n_zip_code := NULL]
# Add a column holding the mean duration of each start/end station pair;
# the group value is repeated on every row of that group.
batrips[
  , duration_mean := mean(duration),
  by = c("start_station", "end_station")
]
## Replace NAs in duration with the mean duration of the same month
# Step 1 adds the per-month mean (ignoring NAs) as a helper column, step 2
# overwrites only the rows where duration is NA, step 3 drops the helper.
# NOTE(review): if duration is stored as integer, assigning the double mean
# may truncate it or raise a coercion warning - confirm the column type.
batrips[, mean_dur := mean(duration, na.rm = TRUE),
by = month(start_date)][is.na(duration),
duration := mean_dur][, mean_dur := NULL]
# Label every (start_station, end_station) pair by its median trip duration:
# "short" (< 600), "medium" (600 to 1800 inclusive) or "long" (> 1800).
# The single value computed per group is recycled to all rows of the group.
batrips[, trip_category := {
  med_dur <- median(duration, na.rm = TRUE)
  if (is.na(med_dur)) {
    # Every duration in the group is missing: leave the category missing
    # instead of failing with "missing value where TRUE/FALSE needed".
    NA_character_
  } else if (med_dur < 600) {
    "short"
  } else if (med_dur <= 1800) {
    # The lower bound is already guaranteed by the previous branch.
    "medium"
  } else {
    "long"
  }
},
by = .(start_station, end_station)]
# Show the first three rows, including the new column.
batrips[1:3]
## trip_id duration start_date start_station
## 1: 139545 435 2014-01-01 00:14:00 SAN FRANCISCO CITY HALL
## 2: 139546 432 2014-01-01 00:14:00 SAN FRANCISCO CITY HALL
## 3: 139547 1523 2014-01-01 00:17:00 EMBARCADERO AT SANSOME
## start_terminal end_date end_station end_terminal bike_id
## 1: 58 2014-01-01 00:21:00 Townsend at 7th 65 473
## 2: 58 2014-01-01 00:21:00 Townsend at 7th 65 395
## 3: 60 2014-01-01 00:42:00 Beale at Market 56 331
## subscription_type zip_code week_day n_zip_code duration_mean
## 1: Subscriber 94612 4 1228 678.6364
## 2: Subscriber 94107 4 36061 678.6364
## 3: Subscriber 94112 4 2168 651.2367
## trip_category
## 1: short
## 2: short
## 3: short
## fread function
## fread saves memory by using the integer64 format instead of the numeric
## format
# File from a URL (only the first data row is read)
DT1 <- fread(
  input = "https://bit.ly/2RkBXhV",
  nrows = 1
)
# File from a string
DT2 <- fread(input = "a,b\n1,2\n3,4")
# Skip the first two lines, which contain metadata
str <- "# Metadata\nTimestamp: 2018-05-01 19:44:28 GMT\na,b\n1,2\n3,4"
fread(input = str, skip = 2)
## a b
## 1: 1 2
## 2: 3 4
## V1 V3
## 1: 1 x
## 2: 3 y
## V1 V3
## 1: 1 x
## 2: 3 y
# Five columns: two integers, a decimal, a boolean and a short string.
str <- "x1,x2,x3,x4,x5\n1,2,1.5,true,cc\n3,4,2.5,false,ff"
# Override the type of one column by name ...
ans <- fread(input = str, colClasses = c(x5 = "factor"))
# ... or give one class per column, in column order.
ans <- fread(
  input = str,
  colClasses = c("integer", "integer", "numeric", "logical", "factor")
)
str(ans)
## Classes 'data.table' and 'data.frame': 2 obs. of 5 variables:
## $ x1: int 1 3
## $ x2: int 2 4
## $ x3: num 1.5 2.5
## $ x4: logi TRUE FALSE
## $ x5: Factor w/ 2 levels "cc","ff": 1 2
## - attr(*, ".internal.selfref")=<externalptr>
# colClasses as a named list: each element name is a target class and its
# value selects the columns, by position (1:4) or by name ("x5", "x6").
str <- "x1,x2,x3,x4,x5,x6\n1,2,1.5,2.5,aa,bb\n3,4,5.5,6.5,cc,dd"
ans <- fread(
  input = str,
  colClasses = list(numeric = 1:4, factor = c("x5", "x6"))
)
str(ans)
## Classes 'data.table' and 'data.frame': 2 obs. of 6 variables:
## $ x1: num 1 3
## $ x2: num 2 4
## $ x3: num 1.5 5.5
## $ x4: num 2.5 6.5
## $ x5: Factor w/ 2 levels "aa","cc": 1 2
## $ x6: Factor w/ 2 levels "bb","dd": 1 2
## - attr(*, ".internal.selfref")=<externalptr>
## x1 x2 x3 x4 x5 x6
## 1: 1 2 1.5 2.5 aa bb
## 2: 3 4 5.5 6.5 cc dd
##
# Treat the placeholders "###" and "#N/A" as missing values while reading.
str <- "x,y,z\n1,###,3\n2,4,###\n#N/A,7,9"
ans <- fread(
  input = str,
  na.strings = c("###", "#N/A"),
  nThread = 1
)
ans
## x y z
## 1: 1 NA 3
## 2: 2 4 NA
## 3: NA 7 9
## fwrite
# One-row table holding the current moment as a date (IDate), a time of
# day (ITime) and a full timestamp.
now <- Sys.time()
dt <- data.table(
  date = as.IDate(now),
  time = as.ITime(now),
  datetime = now
)
dt
## date time datetime
## 1: 2019-09-22 19:15:57 2019-09-22 14:15:57
## date time datetime
## 1: 2019-09-22 19:15:57 2019-09-22T19:15:57.536595Z
## date time datetime
## 1: 20190922 191557 20190922191557536
## date time datetime
## 1: 18161 69357 1569179758